Hockey Goals

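Exploratory analysis of the TidyTuesday 2020-03-03 hockey goal-scoring datasets (game-level goals, the top-250 career goal scorers, and season-level goals), looking at how cumulative goals scored vary with age for the top scorers.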

Imports

# this next line shouldn't have to be here





import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from pandas.plotting import register_matplotlib_converters
from ydata_profiling import ProfileReport

# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
# Apply seaborn's default theme, then switch to the "notebook" context.
sns.set()
sns.set_context("notebook")
# Default figure size as (width, height).
plt.rcParams["figure.figsize"] = 10, 6
# Show every column and row when displaying DataFrames, with 4 digits of
# floating-point precision.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.precision = 4
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Tick-label formatters: whole numbers with thousands separators, with and
# without a leading "$". The `pos` argument is required by FuncFormatter's
# callback signature but is unused here.
dollar_formatter = FuncFormatter(lambda x, pos: f"${x:,.0f}")
thousands_formatter = FuncFormatter(lambda x, pos: f"{x:,.0f}")

Constants

# Number of days per year used to convert a "years-days" age into fractional
# years (see raw_game_to_clean); 365.25 averages in leap years.
DAYS_IN_YEAR = 365.25

Data

Raw

# Load the three raw CSV files of the TidyTuesday 2020-03-03 dataset directly
# from GitHub (requires network access): per-game goals, the top-250 career
# goal scorers, and per-season goals.
game_goals_raw = pd.read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/game_goals.csv"
)
top_250_raw = pd.read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/top_250.csv"
)
season_goals_raw = pd.read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/season_goals.csv"
)
# Preview the first rows of the per-game data.
game_goals_raw.head()

player season rank date game_num age team at opp location outcome goals assists points plus_minus penalty_min goals_even goals_powerplay goals_short goals_gamewinner assists_even assists_powerplay assists_short shots shot_percent
0 Alex Ovechkin 2006 1 2005-10-05 1 20-018 WSH NaN CBJ Home W 2 0 2 1 2 1 1 0 0 NaN NaN NaN 5 40.0
1 Alex Ovechkin 2006 2 2005-10-07 2 20-020 WSH NaN ATL Home L 0 1 1 -2 0 0 0 0 0 NaN NaN NaN 1 0.0
2 Alex Ovechkin 2006 3 2005-10-08 3 20-021 WSH @ ATL Away L 0 1 1 0 4 0 0 0 0 NaN NaN NaN 3 0.0
3 Alex Ovechkin 2006 4 2005-10-10 4 20-023 WSH NaN NYR Home W 1 0 1 1 2 0 1 0 1 NaN NaN NaN 6 16.7
4 Alex Ovechkin 2006 5 2005-10-12 5 20-025 WSH @ CAR Away L 1 0 1 0 0 1 0 0 0 NaN NaN NaN 6 16.7
top_250_raw.head()

raw_rank player years total_goals url_number raw_link link active yr_start
0 1.0 Wayne Gretzky 1979-99 894 1 /players/g/gretzwa01.html https://www.hockey-reference.com/players/g/gre... Retired 1979
1 2.0 Gordie Howe 1946-80 801 2 /players/h/howego01.html https://www.hockey-reference.com/players/h/how... Retired 1946
2 3.0 Jaromir Jagr 1990-18 766 3 /players/j/jagrja01.html https://www.hockey-reference.com/players/j/jag... Retired 1990
3 4.0 Brett Hull 1986-06 741 4 /players/h/hullbr01.html https://www.hockey-reference.com/players/h/hul... Retired 1986
4 5.0 Marcel Dionne 1971-89 731 5 /players/d/dionnma01.html https://www.hockey-reference.com/players/d/dio... Retired 1971
season_goals_raw.head()

rank position hand player years total_goals status yr_start season age team league season_games goals assists points plus_minus penalty_min goals_even goals_power_play goals_short_handed goals_game_winner headshot
0 1 C Left Wayne Gretzky 1979-99 894 Retired 1979 1978-79 18 TOT WHA 80 46 64 110 20.0 19 NaN NaN NaN NaN https://d9kjk42l7bfqz.cloudfront.net/req/20191...
1 1 C Left Wayne Gretzky 1979-99 894 Retired 1979 1978-79 18 INR WHA 8 3 3 6 -3.0 0 3.0 0.0 0.0 NaN https://d9kjk42l7bfqz.cloudfront.net/req/20191...
2 1 C Left Wayne Gretzky 1979-99 894 Retired 1979 1978-79 18 EDO WHA 72 43 61 104 23.0 19 34.0 9.0 0.0 NaN https://d9kjk42l7bfqz.cloudfront.net/req/20191...
3 1 C Left Wayne Gretzky 1979-99 894 Retired 1979 1979-80 19 EDM NHL 79 51 86 137 14.0 21 37.0 13.0 1.0 6.0 https://d9kjk42l7bfqz.cloudfront.net/req/20191...
4 1 C Left Wayne Gretzky 1979-99 894 Retired 1979 1980-81 20 EDM NHL 80 55 109 164 41.0 28 36.0 15.0 4.0 3.0 https://d9kjk42l7bfqz.cloudfront.net/req/20191...

Functions

def raw_game_to_clean(raw):
    """Return a copy of the raw per-game frame with a ``years_old`` column added.

    The ``age`` column is expected to hold strings with two ``-``-separated
    numeric parts (e.g. ``"20-018"``). The first part is treated as whole
    years and the second as additional days; ``years_old`` is the resulting
    age expressed in fractional years.

    Args:
        raw: DataFrame with a string ``age`` column.

    Returns:
        A new DataFrame (``raw`` is not modified) with the extra float
        ``years_old`` column.
    """
    age_parts = raw["age"].str.split("-")
    whole_years = age_parts.str[0].astype(float)
    extra_days = age_parts.str[1].astype(float)

    # Convert to days first, then back to years, using the same year length.
    age_in_days = (whole_years * DAYS_IN_YEAR) + extra_days
    return raw.assign(years_old=(age_in_days / DAYS_IN_YEAR))


def raw_top_to_clean(raw):
    """Return the top-250 frame unchanged; no cleaning is applied yet.

    Args:
        raw: The raw top-250 DataFrame.

    Returns:
        The very same object that was passed in (not a copy).
    """
    # TODO: raw_rank probably needs to be forward-filled.
    return raw


def raw_season_to_clean(raw):
    """Return the season-goals frame unchanged; no cleaning is applied yet.

    Args:
        raw: The raw season-goals DataFrame.

    Returns:
        The very same object that was passed in (not a copy).
    """
    return raw


def list_of_top_n_playernames(top_n=8):
    """Return the ``player`` values of the first ``top_n`` rows of ``top_250``.

    Reads the module-level ``top_250`` frame, so that name must be defined
    before this function is called. Rows are taken in the frame's existing
    order; no sorting is done here.

    Args:
        top_n: Number of leading rows to take.

    Returns:
        A list of player names.
    """
    leading_rows = top_250.head(top_n)
    return leading_rows["player"].tolist()
# Build the cleaned frames used by the rest of the notebook. `top_250` is read
# as a global by list_of_top_n_playernames(), so it must be assigned before
# that function is called.
game_goals = raw_game_to_clean(game_goals_raw)
top_250 = raw_top_to_clean(top_250_raw)
season_goals = raw_season_to_clean(season_goals_raw)

EDA

# Build one profiling report per cleaned frame.
# NOTE(review): config_file is a relative path, so presumably
# config_minimal.yaml must exist in the working directory — confirm.
game_profile = ProfileReport(game_goals, config_file="config_minimal.yaml")
top_profile = ProfileReport(top_250, config_file="config_minimal.yaml")
season_profile = ProfileReport(season_goals, config_file="config_minimal.yaml")
game_profile


<class 'ydata_profiling.profile_report.ProfileReport'>.__repr__ returned empty string
top_profile


<class 'ydata_profiling.profile_report.ProfileReport'>.__repr__ returned empty string
season_profile


<class 'ydata_profiling.profile_report.ProfileReport'>.__repr__ returned empty string

Plots

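Warning – the dataset might have problems: for most of the top players, the sum of goals across their seasons does not match the career total in the top-250 dataset. A likely cause (to be confirmed) is that the season data contains non-NHL seasons (e.g. WHA) and, for seasons split across teams, a "TOT" aggregate row in addition to the per-team rows, both of which would inflate the season sum.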

# Sanity check: for each top scorer, print the career total from top_250 minus
# the sum of that player's per-season goals (0 means the two datasets agree).
# NOTE(review): season_goals appears to include "TOT" aggregate rows alongside
# per-team rows and non-NHL (WHA) seasons, which would be double counted or
# extra in this sum — confirm before trusting the differences.
for player_name in list_of_top_n_playernames(top_n=8):
    in_season_rows = season_goals["player"] == player_name
    in_top_rows = top_250["player"] == player_name

    sum_goals = season_goals.loc[in_season_rows, "goals"].sum()
    top_goals = top_250.loc[in_top_rows, "total_goals"].iloc[0]

    print(f"{top_goals - sum_goals}, {player_name}")
-115, Wayne Gretzky
-174, Gordie Howe
-64, Jaromir Jagr
-32, Brett Hull
-28, Marcel Dionne
-35, Phil Esposito
-139, Mike Gartner
0, Alex Ovechkin

What do the cumulative goals scored look like as a function of age for the top scorers?

# Cumulative goals versus age, one line per top scorer.
# NOTE(review): the running total follows the row order of game_goals, so this
# assumes each player's games are already in chronological order — confirm.
fig, ax = plt.subplots(figsize=(10, 6))
fig.patch.set_facecolor("w")

for player_name in list_of_top_n_playernames(top_n=8):
    plotdf = game_goals.loc[game_goals["player"] == player_name]
    cumulative_goals = plotdf["goals"].cumsum()
    ax.plot(plotdf["years_old"], cumulative_goals, label=player_name)

# Narrow the axes to 80% of their width and anchor the legend just outside
# the right edge, vertically centred.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
ax.set(
    title="Total Goals Scored by Age",
    xlabel="Age [Years]",
    ylabel="Goals Scored",
)
fig.tight_layout()

Clean up Notebook